This document demonstrates the use of the bRF and LASSO-D3S functions for integrative GRN inference.
Those functions infer the regulatory pathways of Arabidopsis thaliana’s roots in response to nitrate (N) induction from Varala et al., 2018.
They use as inputs the expression profiles of N-responsive genes and TFBS information. Prior TFBS information was built by searching in the promoters of the N-responsive genes the PWM of the N-responsive regulators.
Import of the expression data and the N-responsive genes and regulators :
load('rdata/inference_input_N_response_varala.rdata')
genes <- input_data$grouped_genes; length(genes)
## [1] 1426
tfs <- input_data$grouped_regressors; length(tfs)
## [1] 201
counts <- input_data$counts; dim(counts)
## [1] 1426 45
load("rdata/pwm_occurrences_N_response_varala.rdata")
dim(pwm_occurrence)
## [1] 1426 201
ALPHAS <- seq(0,1, by = 0.1)
# subset <- sample(genes, replace = F, size = 20)
subset <- genes
nCores = 40
mats <- list()
nrep <- 100
for(alpha in ALPHAS){ # exploring PWM integration strength
for(rep in 1:nrep){ # exploring inherent variability
mat_lasso <- LASSO.D3S_inference(counts, genes, tfs,
alpha = alpha, N = 100,
int_pwm_noise = 0, score = "freq",
pwm_occurrence = pwm_occurrence,
nCores = nCores)
mat_lasso_perm <- LASSO.D3S_inference(counts, genes, tfs,
alpha = alpha, N = 100,
tf_expression_permutation = TRUE,
int_pwm_noise = 0, score = "freq",
pwm_occurrence = pwm_occurrence,
nCores = nCores)
mats[[paste0("LASSO_", as.character(alpha), '_trueData_', rep)]] <- mat_lasso
mats[[paste0("LASSO_", as.character(alpha), '_shuffled_', rep)]] <- mat_lasso_perm
}
}
save(mats, file = "results/100_permutations_lasso_importances_freq.rdata")
# load( "results/100_permutations_lasso_importances_freq.rdata")
edges <- list()
densities <- c(0.005, 0.01,0.05,0.075)
for(name in names(mats)){ # exploring importance threshold stringency
for(density in densities){
edges[[paste0(name, '_', density)]] <-
LASSO.D3S_network(mats[[name]], density = density, pwm_occurrence, genes, tfs, decreasing = TRUE)
}
}
save(edges, file = "results/100_permutations_lasso_edges_freq.rdata")
load("results/clusters_mse_lasso_100permutations.rdata")
positive_genes <- names(clusters_lasso[clusters_lasso==1])
edges <- list()
densities <- c(0.005, 0.01,0.05,0.075)
for(name in names(mats)){ # exploring importance threshold stringency
for(density in densities){
edges[[paste0(name, '_', density)]] <-
LASSO.D3S_network(mats[[name]][,positive_genes], density = density, pwm_occurrence, positive_genes, tfs, decreasing = TRUE)
}
}
save(edges, file = "results/100_permutations_lasso_edges_positive_same_density_freq.rdata")
color_palette = c("#C12131", "#EC5D2F", "#FE945C", "#FFC08E" )
prettyZero <- function(l){
max.decimals = max(nchar(str_extract(l, "\\.[0-9]+")), na.rm = T)-1
lnew = formatC(l, replace.zero = T, zero.print = "0",
digits = max.decimals, format = "f", preserve.width=T)
return(lnew)
}
settings <- c("method", "alpha", "dataset","rep", "density")
load("results/100_permutations_lasso_edges_freq.rdata")
# number of edges per network
nrows <-
data.frame(alpha_rep = names(unlist(lapply(edges, FUN = nrow))),
n_edges = unlist(lapply(edges, FUN = nrow)))
nrows[, settings] <-
str_split_fixed(nrows$alpha_rep, '_', length(settings))
edges_num <- lapply(edges, function(df)
df[sapply(df, is.numeric)])
pwm_support <-
data.frame(alpha_rep = names(unlist(lapply(edges_num, FUN = nrow))),
pwm = unlist(lapply(edges_num, FUN = colMeans)))
pwm_support[, settings] <-
str_split_fixed(pwm_support$alpha_rep, '_', length(settings))
pwm_support %>%
left_join(nrows, by = settings) %>%
mutate(alpha = as.numeric(alpha),
density_label = paste(density, ':', n_edges, 'edges')) %>%
ggplot(aes(color = density_label, x = alpha, y = pwm)) +
ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) +
geom_point() +
geom_smooth(aes(fill = density_label, linetype =dataset) , alpha = 0.1) +
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.position = "top",
legend.text = element_text(size = 15),
axis.text = element_text(size = 12)
) +
xlab(expression(alpha)) + ylab("Mean PWM score in selected edges") +
ggtitle("Average PWM support of inferred edges") +
guides(color = guide_legend(nrow = 2, byrow = TRUE),
fill = guide_legend(nrow = 2, byrow = TRUE)) +
ylab(expression(paste("mean(", pi[tr], ")"))) +
scale_x_continuous(labels = prettyZero) +
scale_color_manual(name = "Network density", values = color_palette) +
scale_fill_manual(name = "Network density", values = color_palette)
val_conn <-
evaluate_networks(
edges,
validation = c("TARGET", "CHIPSeq", "DAPSeq"),
nCores = 25,
input_genes = genes,
input_tfs = tfs
)
val_conn[, settings] <-
str_split_fixed(val_conn$network_name, '_', length(settings))
save(val_conn, file = "results/100_permutations_lasso_validation_freq.rdata")
#
val_chip <-
evaluate_networks(
edges,
validation = c("CHIPSeq"),
nCores = 25,
input_genes = genes,
input_tfs = tfs
)
val_chip[, settings] <-
str_split_fixed(val_chip$network_name, '_', length(settings))
save(val_chip, file = "results/100_permutations_lasso_validation_chip_freq.rdata")
val_target <-
evaluate_networks(
edges,
validation = c("TARGET"),
nCores = 25,
input_genes = genes,
input_tfs = tfs
)
val_target[, settings] <-
str_split_fixed(val_target$network_name, '_', length(settings))
save(val_target, file = "results/100_permutations_lasso_validation_target_freq.rdata")
val_dap <-
evaluate_networks(
edges,
validation = c("DAPSeq"),
nCores = 35,
input_genes = genes,
input_tfs = tfs
)
val_dap[, settings] <-
str_split_fixed(val_dap$network_name, '_', length(settings))
save(val_dap, file = "results/100_permutations_lasso_validation_dap_freq.rdata")
get_nTF <- function(net){
return(mean(table(net$to)))
}
get_TF <- function(net){
return(length(unique(net$from)))
}
get_nGenes <- function(net){
return(length(unique(net$to)))
}
# subset <- edges[as.numeric(str_split_fixed(names(edges), '_', 5)[,4])<6]
topo <- data.frame(alpha_rep = names(edges),
n_TFs = unlist(lapply(edges, FUN = get_nTF)),
TFs = unlist(lapply(edges, FUN = get_TF)),
n_genes = unlist(lapply(edges, FUN = get_nGenes)))
topo[, settings] <- str_split_fixed(topo$alpha_rep,'_', length(settings))
data_topo <- topo %>%
left_join(nrows, by = settings) %>%
mutate(alpha = as.numeric(alpha)) %>%
mutate(density_label = paste(density, ':', n_edges, 'edges')) %>%
group_by(alpha, density, dataset) %>%
mutate(mean_n_TFs = mean(n_TFs, na.rm=T),
sd_n_TFs = sd(n_TFs, na.rm=T),
mean_TFs = mean(TFs, na.rm=T),
sd_TFs = sd(TFs, na.rm=T),
mean_nGenes = mean(n_genes, na.rm=T),
sd_nGenes = sd(n_genes, na.rm=T))
regs_per_genes <- data_topo %>%
ggplot(aes(color = dataset, fill = dataset,x = alpha, y = n_TFs)) +
ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) + geom_point(alpha = 0.2) +
geom_smooth(se = F) +
geom_ribbon(aes(ymin = mean_n_TFs - sd_n_TFs ,
ymax = mean_n_TFs + sd_n_TFs ),
alpha = .4)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_text(size = 12),
legend.position = 'none'
) +
scale_x_continuous(labels = prettyZero) +
xlab(expression(alpha)) + ylab("Average number of regulator per gene") +
ggtitle("Average number of regulator per gene")
regs_in_network <- data_topo %>%
ggplot(aes(color = dataset, fill = dataset, x = alpha, y = TFs)) +
ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) + geom_point(alpha = 0.2) +
geom_ribbon(aes(ymin = mean_TFs - sd_TFs ,
ymax = mean_TFs + sd_TFs ),
alpha = .4)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_text(size = 12),
legend.position = 'top'
) +
scale_x_continuous(labels = prettyZero) +
xlab(expression(alpha))+ ylab("Number of distinct regulators") +
ggtitle("Number of distinct regulators")
targets_in_network <- data_topo %>%
ggplot(aes(color = dataset, fill = dataset,x = alpha, y = n_genes)) +
ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) +
geom_point(alpha = 0.2) +
geom_smooth(se = F) +
geom_ribbon(aes(ymin = mean_nGenes - sd_nGenes ,
ymax = mean_nGenes + sd_nGenes ),
alpha = .4)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_text(size = 12),
legend.position = 'top'
) +
scale_x_continuous(labels = prettyZero) +
xlab(expression(alpha))+ ylab("Number of distinct targets") +
ggtitle("Number of distinct targets")
figure <- targets_in_network+regs_in_network+regs_per_genes + plot_layout(guides = "collect") + plot_annotation(tag_levels = 'a');figure
load("results/100_permutations_lasso_edges_positive_same_density_freq.rdata")
# number of edges per network
nrows <-
data.frame(alpha_rep = names(unlist(lapply(edges, FUN = nrow))),
n_edges = unlist(lapply(edges, FUN = nrow)))
nrows[, settings] <-
str_split_fixed(nrows$alpha_rep, '_', length(settings))
edges_num <- lapply(edges, function(df)
df[sapply(df, is.numeric)])
pwm_support <-
data.frame(alpha_rep = names(unlist(lapply(edges_num, FUN = nrow))),
pwm = unlist(lapply(edges_num, FUN = colMeans)))
pwm_support[, settings] <-
str_split_fixed(pwm_support$alpha_rep, '_', length(settings))
pwm_support %>%
left_join(nrows, by = settings) %>%
mutate(alpha = as.numeric(alpha),
density_label = paste(density, ':', n_edges, 'edges')) %>%
ggplot(aes(color = density_label, x = alpha, y = pwm)) +
ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) +
geom_point() +
geom_smooth(aes(fill = density_label, linetype =dataset) , alpha = 0.1) +
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.position = "top",
legend.text = element_text(size = 15),
axis.text = element_text(size = 12)
) +
xlab(expression(alpha)) + ylab("Mean PWM score in selected edges") +
ggtitle("Average PWM support of inferred edges") +
guides(color = guide_legend(nrow = 2, byrow = TRUE),
fill = guide_legend(nrow = 2, byrow = TRUE)) +
ylab(expression(paste("mean(", pi[tr], ")"))) +
scale_x_continuous(labels = prettyZero)
get_nTF <- function(net){
return(mean(table(net$to)))
}
get_TF <- function(net){
return(length(unique(net$from)))
}
get_nGenes <- function(net){
return(length(unique(net$to)))
}
# subset <- edges[as.numeric(str_split_fixed(names(edges), '_', 5)[,4])<6]
topo <- data.frame(alpha_rep = names(edges),
n_TFs = unlist(lapply(edges, FUN = get_nTF)),
TFs = unlist(lapply(edges, FUN = get_TF)),
n_genes = unlist(lapply(edges, FUN = get_nGenes)))
topo[, settings] <- str_split_fixed(topo$alpha_rep,'_', length(settings))
data_topo <- topo %>%
left_join(nrows, by = settings) %>%
mutate(alpha = as.numeric(alpha)) %>%
mutate(density_label = paste(density, ':', n_edges, 'edges')) %>%
group_by(alpha, density, dataset) %>%
mutate(mean_n_TFs = mean(n_TFs, na.rm=T),
sd_n_TFs = sd(n_TFs, na.rm=T),
mean_TFs = mean(TFs, na.rm=T),
sd_TFs = sd(TFs, na.rm=T),
mean_nGenes = mean(n_genes, na.rm=T),
sd_nGenes = sd(n_genes, na.rm=T))
regs_per_genes <- data_topo %>%
ggplot(aes(color = dataset, fill = dataset,x = alpha, y = n_TFs)) +
ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) + geom_point(alpha = 0.2) +
geom_smooth(se = F) +
geom_ribbon(aes(ymin = mean_n_TFs - sd_n_TFs ,
ymax = mean_n_TFs + sd_n_TFs ),
alpha = .4)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_text(size = 12),
legend.position = 'none'
) +
scale_x_continuous(labels = prettyZero) +
xlab(expression(alpha)) + ylab("Average number of regulator per gene") +
ggtitle("Average number of regulator per gene")
regs_in_network <- data_topo %>%
ggplot(aes(color = dataset, fill = dataset, x = alpha, y = TFs)) +
ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) + geom_point(alpha = 0.2) +
geom_ribbon(aes(ymin = mean_TFs - sd_TFs ,
ymax = mean_TFs + sd_TFs ),
alpha = .4)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_text(size = 12),
legend.position = 'top'
) +
scale_x_continuous(labels = prettyZero) +
xlab(expression(alpha))+ ylab("Number of distinct regulators") +
ggtitle("Number of distinct regulators")
targets_in_network <- data_topo %>%
ggplot(aes(color = dataset, fill = dataset,x = alpha, y = n_genes)) +
ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) +
geom_point(alpha = 0.2) +
geom_smooth(se = F) +
geom_ribbon(aes(ymin = mean_nGenes - sd_nGenes ,
ymax = mean_nGenes + sd_nGenes ),
alpha = .4)+
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_text(size = 12),
legend.position = 'top'
) +
scale_x_continuous(labels = prettyZero) +
xlab(expression(alpha))+ ylab("Number of distinct targets") +
ggtitle("Number of distinct targets")
figure <- targets_in_network+regs_in_network+regs_per_genes + plot_layout(guides = "collect") + plot_annotation(tag_levels = 'a');figure
Not all densities should be explored with this restriction of genes, as there are too few genes to grant sufficient interactions supported by a prior of 1. We keep only the two smallest densities.
val_conn_pos <-
evaluate_networks(
edges,
validation = c("TARGET", "CHIPSeq", "DAPSeq"),
nCores = 25,
input_genes = genes,
input_tfs = tfs
)
val_conn_pos[, settings] <-
str_split_fixed(val_conn_pos$network_name, '_', length(settings))
save(val_conn_pos, file = "results/100_permutations_lasso_validation_cluster_pos_freq.rdata")
#
# val_chip <-
# evaluate_networks(
# edges,
# validation = c("CHIPSeq"),
# nCores = 25,
# input_genes = genes,
# input_tfs = tfs
# )
#
# val_chip[, settings] <-
# str_split_fixed(val_chip$network_name, '_', length(settings))
# save(val_chip, file = "results/100_permutations_lasso_validation_chip_cluster_pos_freq.rdata")
#
#
# val_target <-
# evaluate_networks(
# edges,
# validation = c("TARGET"),
# nCores = 25,
# input_genes = genes,
# input_tfs = tfs
# )
# val_target[, settings] <-
# str_split_fixed(val_target$network_name, '_', length(settings))
# save(val_target, file = "results/100_permutations_lasso_validation_target_cluster_pos.rdata")
val_dap <-
evaluate_networks(
edges,
validation = c("DAPSeq"),
nCores = 25,
input_genes = genes,
input_tfs = tfs
)
val_dap[, settings] <-
str_split_fixed(val_dap$network_name, '_', length(settings))
save(val_dap, file = "results/100_permutations_lasso_validation_dap_cluster_pos_freq.rdata")
draw_precision_recall <- function(validation, data_type){
data_val <- validation %>%
group_by(alpha, dataset, density) %>%
mutate(mean_precision = mean(precision, na.rm = T),
sd_precision = sd(precision, na.rm = T),
mean_recall = mean(recall, na.rm = T),
sd_recall = sd(recall, na.rm = T)) %>%
dplyr::select(-network_name) %>%
left_join(nrows, by = settings) %>%
mutate(alpha = as.numeric(alpha)) %>%
mutate(density_label = paste(density, ':', n_edges, 'edges'))
precision <- data_val %>%
ggplot(aes(
x = as.numeric(alpha),
y = precision,
color = dataset,
fill = dataset
))+
ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) +
geom_ribbon(aes(ymin = mean_precision - sd_precision ,
ymax = mean_precision + sd_precision ),
alpha = .4) +theme_pubr(legend = "top")+
geom_point(alpha = 0.1) + geom_smooth(se=F)+xlab("alpha") +
xlab(expression(alpha)) + ylab("Precision") +
ggtitle(paste("Precision against", data_type)) +
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_text(size = 12),
legend.position = 'top'
)
recall <- data_val%>%
ggplot(aes(
x = as.numeric(alpha),
y = recall,
color = dataset,
fill = dataset
))+
ggh4x::facet_nested_wrap(vars(method, density), ncol = 8,
nest_line = T, scales = "free") +
geom_ribbon(aes(ymin = mean_recall - sd_recall ,
ymax = mean_recall + sd_recall ),
alpha = .4) +theme_pubr(legend = "none")+
geom_point(alpha = 0.1) + geom_smooth(se=F)+xlab("alpha") +
xlab(expression(alpha)) + ylab("Recall") +
ggtitle(paste("Recall against", data_type)) +
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_text(size = 12),
legend.position = 'none'
)
return(precision/recall)
}
load("results/100_permutations_lasso_validation_freq.rdata")
draw_precision_recall(val_conn, "ConnecTF")
load(file = "results/100_permutations_lasso_validation_cluster_pos_freq.rdata")
draw_precision_recall(val_conn_pos, "ConnecTF, positive genes")
load("results/100_permutations_lasso_validation_chip.rdata")
draw_precision_recall(val_chip, "CHIP-Seq")
load(file = "results/100_permutations_lasso_validation_chip_cluster_pos.rdata")
# draw_precision_recall(val_chip, "CHIP-Seq, positive genes")
load("results/100_permutations_lasso_validation_target.rdata")
draw_precision_recall(val_target, "TARGET")
load(file = "results/100_permutations_lasso_validation_target_cluster_pos.rdata")
# draw_precision_recall(val_target, "TARGET, positive genes")
load("results/100_permutations_lasso_validation_dap_freq.rdata")
draw_precision_recall(val_dap, "DAP-Seq")
load(file = "results/100_permutations_lasso_validation_dap_cluster_pos_freq.rdata")
draw_precision_recall(val_dap, "DAP-Seq, positive genes")